import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns; sns.set()
from numpy import *
from scipy import stats
from pandas.plotting import scatter_matrix
import sklearn
import warnings
data = pd.read_excel(r'Attrition Data Exercise.xlsx')
data.columns
Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
'DistanceFromHome', 'Education', 'Educ_bucket', 'EducationField',
'EmployeeCount', 'EmployeeNumber', 'EnvironmentSatisfaction',
'EnvironmentSatisfaction_Bucket', 'Gender', 'HourlyRate',
'JobInvolvement', 'JobInvolvement_bucket', 'JobLevel', 'JobRole',
'JobSatisfaction', 'JobSatisfaction_bucket', 'MaritalStatus',
'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'Over18',
'OverTime', 'PercentSalaryHike', 'PerformanceRating',
'RelationshipSatisfaction', 'RelationshipSatisfaction_bucket',
'StandardHours', 'StockOptionLevel', 'TotalWorkingYears',
'TrainingTimesLastYear', 'WorkLifeBalance', 'WorkLifeBalance_bucket',
'Company Tenure (yrs)', 'Company Tenure (yrs)_Bucket',
'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager',
'Location'],
dtype='object')
data.head(5)
| Age | Attrition | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | Educ_bucket | EducationField | EmployeeCount | ... | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | WorkLifeBalance_bucket | Company Tenure (yrs) | Company Tenure (yrs)_Bucket | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | Location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 41 | Yes | Travel_Rarely | 1102 | Sales | 1 | 2 | College | Life Sciences | 1 | ... | 8 | 0 | 1 | Bad | 6 | 5 to 10 years | 4 | 0 | 5 | India |
| 1 | 49 | No | Travel_Frequently | 279 | Research & Development | 8 | 1 | Below College | Life Sciences | 1 | ... | 10 | 3 | 3 | Better | 10 | 10 to 15 yrs | 7 | 1 | 7 | China |
| 2 | 37 | Yes | Travel_Rarely | 1373 | Research & Development | 2 | 2 | College | Other | 1 | ... | 7 | 3 | 3 | Better | 0 | Less than 2 years | 0 | 0 | 0 | India |
| 3 | 33 | No | Travel_Frequently | 1392 | Research & Development | 3 | 4 | Masters | Life Sciences | 1 | ... | 8 | 3 | 3 | Better | 8 | 5 to 10 years | 7 | 3 | 0 | India |
| 4 | 27 | No | Travel_Rarely | 591 | Research & Development | 2 | 1 | Below College | Medical | 1 | ... | 6 | 3 | 3 | Better | 2 | 3 to 5 yrs | 2 | 2 | 2 | India |
5 rows × 43 columns
# Coarsen the pay and age columns in place. A negative `decimals` rounds to
# the left of the decimal point (-1 -> tens, -2 -> hundreds, -3 -> thousands).
rounding_spec = {
    'DailyRate': -2,
    'HourlyRate': -1,
    'MonthlyIncome': -3,
    'MonthlyRate': -3,
    'Age': -1,
}
for column_name, decimals in rounding_spec.items():
    data[column_name] = data[column_name].round(decimals)

# NOTE: this binds a second name to the same DataFrame (no copy is made), so
# the in-place Attrition re-encoding below is also visible via `data_original`.
data_original = data

# Encode the target as 1 ('Yes') / 0 ('No'); any other label raises KeyError.
Num_val = {'Yes': 1, 'No': 0}
data['Attrition'] = data['Attrition'].map(lambda label: Num_val[label])
data.iloc[:, 30 : 40]
| RelationshipSatisfaction_bucket | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | WorkLifeBalance_bucket | Company Tenure (yrs) | Company Tenure (yrs)_Bucket | YearsInCurrentRole | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Low | 80 | 0 | 8 | 0 | 1 | Bad | 6 | 5 to 10 years | 4 |
| 1 | Very high | 80 | 1 | 10 | 3 | 3 | Better | 10 | 10 to 15 yrs | 7 |
| 2 | Medium | 80 | 0 | 7 | 3 | 3 | Better | 0 | Less than 2 years | 0 |
| 3 | High | 80 | 0 | 8 | 3 | 3 | Better | 8 | 5 to 10 years | 7 |
| 4 | Very high | 80 | 1 | 6 | 3 | 3 | Better | 2 | 3 to 5 yrs | 2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1465 | High | 80 | 1 | 17 | 3 | 3 | Better | 5 | 5 to 10 years | 2 |
| 1466 | Low | 80 | 1 | 9 | 5 | 3 | Better | 7 | 5 to 10 years | 7 |
| 1467 | Medium | 80 | 1 | 6 | 0 | 3 | Better | 6 | 5 to 10 years | 2 |
| 1468 | Very high | 80 | 0 | 17 | 3 | 2 | Good | 9 | 5 to 10 years | 6 |
| 1469 | Low | 80 | 0 | 6 | 3 | 4 | Best | 4 | 3 to 5 yrs | 3 |
1470 rows × 10 columns
data.describe()
| Age | Attrition | DailyRate | DistanceFromHome | Education | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | HourlyRate | JobInvolvement | ... | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | Company Tenure (yrs) | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.0 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | ... | 1470.000000 | 1470.0 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 |
| mean | 37.061224 | 0.161224 | 801.836735 | 9.192517 | 2.912925 | 1.0 | 1024.865306 | 2.721769 | 65.986395 | 2.729932 | ... | 2.712245 | 80.0 | 0.793878 | 11.279592 | 2.799320 | 2.761224 | 7.008163 | 4.229252 | 2.187755 | 4.123129 |
| std | 9.667885 | 0.367863 | 406.000802 | 8.106864 | 1.024165 | 0.0 | 602.024335 | 1.093082 | 20.796262 | 0.711561 | ... | 1.081209 | 0.0 | 0.852077 | 7.780782 | 1.289271 | 0.706476 | 6.126525 | 3.623137 | 3.222430 | 3.568136 |
| min | 20.000000 | 0.000000 | 100.000000 | 1.000000 | 1.000000 | 1.0 | 1.000000 | 1.000000 | 30.000000 | 1.000000 | ... | 1.000000 | 80.0 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 30.000000 | 0.000000 | 500.000000 | 2.000000 | 2.000000 | 1.0 | 491.250000 | 2.000000 | 50.000000 | 2.000000 | ... | 2.000000 | 80.0 | 0.000000 | 6.000000 | 2.000000 | 2.000000 | 3.000000 | 2.000000 | 0.000000 | 2.000000 |
| 50% | 40.000000 | 0.000000 | 800.000000 | 7.000000 | 3.000000 | 1.0 | 1020.500000 | 3.000000 | 70.000000 | 3.000000 | ... | 3.000000 | 80.0 | 1.000000 | 10.000000 | 3.000000 | 3.000000 | 5.000000 | 3.000000 | 1.000000 | 3.000000 |
| 75% | 40.000000 | 0.000000 | 1200.000000 | 14.000000 | 4.000000 | 1.0 | 1555.750000 | 4.000000 | 80.000000 | 3.000000 | ... | 4.000000 | 80.0 | 1.000000 | 15.000000 | 3.000000 | 3.000000 | 9.000000 | 7.000000 | 3.000000 | 7.000000 |
| max | 60.000000 | 1.000000 | 1500.000000 | 29.000000 | 5.000000 | 1.0 | 2068.000000 | 4.000000 | 100.000000 | 4.000000 | ... | 4.000000 | 80.0 | 3.000000 | 40.000000 | 6.000000 | 4.000000 | 40.000000 | 18.000000 | 15.000000 | 17.000000 |
8 rows × 27 columns
# Remove columns that are not used for modelling.
threshold = 0.0

# Constant numeric columns (standard deviation equal to `threshold`).
# numeric_only=True is required on pandas >= 2.0, where std() raises
# TypeError on the string columns of this frame instead of skipping them.
# The deviations are computed once and reused for the mask.
column_std = data.std(numeric_only=True)
constant_columns = column_std[column_std == threshold].index
data = data.drop(columns=constant_columns)

# EmployeeNumber -- presumably a per-row identifier; confirm before reuse.
data = data.drop(columns='EmployeeNumber')

# The *_bucket columns appear to be text labels for numeric columns that
# remain in the frame (e.g. Education / Educ_bucket) -- TODO confirm.
data = data.drop(
    columns=[
        'Educ_bucket',
        'EnvironmentSatisfaction_Bucket',
        'JobInvolvement_bucket',
        'JobSatisfaction_bucket',
        'RelationshipSatisfaction_bucket',
        'WorkLifeBalance_bucket',
        'Company Tenure (yrs)_Bucket',
    ]
)

# NOTE: an alias, not a copy -- `data_new` and `data` are the same object.
data_new = data
# Annotated correlation heatmap of the numeric columns.
#
# The correlation matrix is computed once, on numeric columns only:
# DataFrame.corr() raises on string columns with pandas >= 2.0, and
# select_dtypes() works on every pandas version.
corr_matrix = data.select_dtypes(include='number').corr()

fig, ax = plt.subplots(figsize=(200, 200))
plt.title('Correlation Plot', fontsize=50)
sns.heatmap(
    corr_matrix,
    # All-False mask: no cell is hidden. `np.bool` was removed in
    # NumPy 1.24, so the builtin `bool` is used as the dtype.
    mask=np.zeros_like(corr_matrix, dtype=bool),
    cmap=sns.diverging_palette(220, 10, as_cmap=True),
    square=True,
    ax=ax,
    annot=True,
    linewidths=3,
)
plt.xticks(rotation=90, fontsize=100)
plt.yticks(rotation=0, fontsize=100)
plt.show()
# Target vector: the Attrition column of `data`.
Y = data.Attrition
Y.shape
(1470,)
# Feature frame: `data` without the Attrition column.
X = data.drop('Attrition', axis = 1)
# Show the columns from position 20 onward; the slice end (50) may exceed the
# number of columns, in which case iloc simply stops at the last one.
X.iloc[:, 20 : 50]
| PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | Company Tenure (yrs) | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | Location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 11 | 3 | 1 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 | India |
| 1 | 23 | 4 | 4 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 | China |
| 2 | 15 | 3 | 2 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 | India |
| 3 | 11 | 3 | 3 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 | India |
| 4 | 12 | 3 | 4 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 | India |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1465 | 17 | 3 | 3 | 1 | 17 | 3 | 3 | 5 | 2 | 0 | 3 | USA |
| 1466 | 15 | 3 | 1 | 1 | 9 | 5 | 3 | 7 | 7 | 1 | 7 | USA |
| 1467 | 20 | 4 | 2 | 1 | 6 | 0 | 3 | 6 | 2 | 0 | 3 | USA |
| 1468 | 14 | 3 | 4 | 0 | 17 | 3 | 2 | 9 | 6 | 0 | 8 | USA |
| 1469 | 12 | 3 | 1 | 0 | 6 | 3 | 4 | 4 | 3 | 1 | 2 | USA |
1470 rows × 12 columns
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()
from sklearn.preprocessing import OrdinalEncoder
oe = OrdinalEncoder()
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
X.iloc[:, 0 : 20]
| Age | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EnvironmentSatisfaction | Gender | HourlyRate | JobInvolvement | JobLevel | JobRole | JobSatisfaction | MaritalStatus | MonthlyIncome | MonthlyRate | NumCompaniesWorked | Over18 | OverTime | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 40 | Travel_Rarely | 1100 | Sales | 1 | 2 | Life Sciences | 2 | Female | 90 | 3 | 2 | Sales Executive | 4 | Single | 6000 | 19000 | 8 | Y | Yes |
| 1 | 50 | Travel_Frequently | 300 | Research & Development | 8 | 1 | Life Sciences | 3 | Male | 60 | 2 | 2 | Research Scientist | 2 | Married | 5000 | 25000 | 1 | Y | No |
| 2 | 40 | Travel_Rarely | 1400 | Research & Development | 2 | 2 | Other | 4 | Male | 90 | 2 | 1 | Laboratory Technician | 3 | Single | 2000 | 2000 | 6 | Y | Yes |
| 3 | 30 | Travel_Frequently | 1400 | Research & Development | 3 | 4 | Life Sciences | 4 | Female | 60 | 3 | 1 | Research Scientist | 3 | Married | 3000 | 23000 | 1 | Y | Yes |
| 4 | 30 | Travel_Rarely | 600 | Research & Development | 2 | 1 | Medical | 1 | Male | 40 | 3 | 1 | Laboratory Technician | 2 | Married | 3000 | 17000 | 9 | Y | No |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1465 | 40 | Travel_Frequently | 900 | Research & Development | 23 | 2 | Medical | 3 | Male | 40 | 4 | 2 | Laboratory Technician | 4 | Married | 3000 | 12000 | 4 | Y | No |
| 1466 | 40 | Travel_Rarely | 600 | Research & Development | 6 | 1 | Medical | 4 | Male | 40 | 2 | 3 | Healthcare Representative | 1 | Married | 10000 | 21000 | 4 | Y | No |
| 1467 | 30 | Travel_Rarely | 200 | Research & Development | 4 | 3 | Life Sciences | 2 | Male | 90 | 4 | 2 | Manufacturing Director | 2 | Married | 6000 | 5000 | 1 | Y | Yes |
| 1468 | 50 | Travel_Frequently | 1000 | Sales | 2 | 3 | Medical | 4 | Male | 60 | 2 | 2 | Sales Executive | 2 | Married | 5000 | 13000 | 2 | Y | No |
| 1469 | 30 | Travel_Rarely | 600 | Research & Development | 8 | 3 | Medical | 2 | Male | 80 | 4 | 2 | Laboratory Technician | 3 | Married | 4000 | 10000 | 2 | Y | No |
1470 rows × 20 columns
numerical_ix = X.select_dtypes(include=['int64', 'float64']).columns
numerical_ix
Index(['Age', 'DailyRate', 'DistanceFromHome', 'Education',
'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement', 'JobLevel',
'JobSatisfaction', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction',
'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
'WorkLifeBalance', 'Company Tenure (yrs)', 'YearsInCurrentRole',
'YearsSinceLastPromotion', 'YearsWithCurrManager'],
dtype='object')
# Column groups, one per preprocessing treatment.
one_hot_columns = [
    'Department', 'EducationField', 'Gender', 'JobRole',
    'MaritalStatus', 'Over18', 'OverTime', 'Location',
]
# NOTE(review): OrdinalEncoder() with default settings derives the category
# order from the data -- confirm it matches the intended travel ranking.
ordinal_columns = ['BusinessTravel']
scaled_columns = [
    'Age', 'DailyRate', 'DistanceFromHome', 'Education',
    'EnvironmentSatisfaction', 'HourlyRate',
    'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome',
    'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike',
    'PerformanceRating', 'RelationshipSatisfaction',
    'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
    'WorkLifeBalance', 'Company Tenure (yrs)', 'YearsInCurrentRole',
    'YearsSinceLastPromotion', 'YearsWithCurrManager',
]

# Any column not listed above is passed through unchanged.
column_trans = make_column_transformer(
    (OneHotEncoder(), one_hot_columns),
    (OrdinalEncoder(), ordinal_columns),
    (StandardScaler(), scaled_columns),
    remainder='passthrough',
)

# NOTE(review): the transformer is fitted on all of X, before the train/test
# split further down, so the fitted statistics include rows that later land
# in the test set.
X_new = column_trans.fit_transform(X)
column_trans.get_feature_names
<bound method ColumnTransformer.get_feature_names of ColumnTransformer(remainder='passthrough',
transformers=[('onehotencoder', OneHotEncoder(),
['Department', 'EducationField', 'Gender',
'JobRole', 'MaritalStatus', 'Over18',
'OverTime', 'Location']),
('ordinalencoder', OrdinalEncoder(),
['BusinessTravel']),
('standardscaler', StandardScaler(),
['Age', 'DailyRate', 'DistanceFromHome',
'Education', 'EnvironmentSatisfacti...
'HourlyRate', 'JobInvolvement', 'JobLevel',
'JobSatisfaction', 'MonthlyIncome',
'MonthlyRate', 'NumCompaniesWorked',
'PercentSalaryHike', 'PerformanceRating',
'RelationshipSatisfaction',
'StockOptionLevel', 'TotalWorkingYears',
'TrainingTimesLastYear', 'WorkLifeBalance',
'Company Tenure (yrs)', 'YearsInCurrentRole',
'YearsSinceLastPromotion',
'YearsWithCurrManager'])])>
def get_feature_names(column_transformer):
    """Get feature names from all transformers.

    Walks a fitted ``ColumnTransformer`` (or a ``Pipeline``) and collects one
    name per output feature, each prefixed with ``"<transformer name>__"``.
    Transformers that cannot report their own output names fall back to
    their input column names, with a warning.

    Parameters
    ----------
    column_transformer : fitted ColumnTransformer or Pipeline
        The object whose output feature names are wanted. Nested pipelines
        inside a ColumnTransformer are handled recursively.

    Returns
    -------
    feature_names : list of strings
        Names of the features produced by transform.
    """
    # Local import: do not rely on ``sklearn.pipeline`` having been loaded as
    # a side effect of some other sklearn import.
    from sklearn.pipeline import Pipeline

    def _all_strings(items):
        # Explicit loop on purpose: this file does ``from numpy import *``,
        # which rebinds ``all`` to ``numpy.all``; that is not a drop-in
        # replacement for the builtin when handed a generator expression.
        for item in items:
            if not isinstance(item, str):
                return False
        return True

    # Turn lookup into a function for better handling with pipelines.
    # ``name`` and ``column`` are passed in explicitly rather than captured
    # from the enclosing loop.
    def get_names(name, trans, column):
        if trans == 'drop' or (
                hasattr(column, '__len__') and not len(column)):
            return []

        if trans == 'passthrough':
            # Older scikit-learn keeps the input columns in ``_df_columns``;
            # newer releases expose them as ``feature_names_in_``.
            input_names = getattr(column_transformer, '_df_columns', None)
            if input_names is None:
                input_names = getattr(
                    column_transformer, 'feature_names_in_', None)
            if input_names is not None:
                if not isinstance(column, slice) and _all_strings(column):
                    return column
                return list(input_names[column])
            n_features = getattr(column_transformer, '_n_features', None)
            if n_features is None:
                n_features = column_transformer.n_features_in_
            indices = np.arange(n_features)
            return ['x%d' % i for i in indices[column]]

        # Legacy API first, so results are unchanged where it still exists.
        if hasattr(trans, 'get_feature_names'):
            return [name + "__" + f for f in trans.get_feature_names()]

        # Newer scikit-learn only offers get_feature_names_out(). Without this
        # branch an expanding transformer (e.g. OneHotEncoder) would fall
        # through to the input-column fallback and return too few names.
        # Restricted to ColumnTransformer entries (column is known); pipeline
        # steps keep the original behaviour.
        if column is not None and hasattr(trans, 'get_feature_names_out'):
            return [name + "__" + f for f in trans.get_feature_names_out()]

        # No way to ask the transformer: warn and use the input column names.
        warnings.warn("Transformer %s (type %s) does not "
                      "provide get_feature_names. "
                      "Will return input column names if available"
                      % (str(name), type(trans).__name__))
        if column is None:
            return []
        return [name + "__" + f for f in column]

    ### Start of processing
    feature_names = []

    # Pipeline steps carry no column information, so ``column`` is None.
    if isinstance(column_transformer, Pipeline):
        l_transformers = [
            (name, trans, None)
            for name, trans in column_transformer.steps
            if trans is not None and trans != 'passthrough'
        ]
    else:
        # Fitted (name, transformer, columns) triples, remainder included.
        l_transformers = list(column_transformer.transformers_)

    for name, trans, column in l_transformers:
        if isinstance(trans, Pipeline):
            # Recursive call on pipeline
            _names = get_feature_names(trans)
            # If no step in the pipeline returned names, use the inputs.
            if len(_names) == 0:
                _names = [name + "__" + f for f in column]
            feature_names.extend(_names)
        else:
            feature_names.extend(get_names(name, trans, column))

    return feature_names
feat_names = get_feature_names(column_trans)
<ipython-input-889-aeb52d502674>:30: UserWarning: Transformer ordinalencoder (type OrdinalEncoder) does not provide get_feature_names. Will return input column names if available
warnings.warn("Transformer %s (type %s) does not "
<ipython-input-889-aeb52d502674>:30: UserWarning: Transformer standardscaler (type StandardScaler) does not provide get_feature_names. Will return input column names if available
warnings.warn("Transformer %s (type %s) does not "
X_new.shape
(1470, 53)
X_new
array([[ 0. , 0. , 1. , ..., -0.0632959 ,
-0.67914568, 0.24583399],
[ 0. , 1. , 0. , ..., 0.76499762,
-0.36871529, 0.80654148],
[ 0. , 1. , 0. , ..., -1.16768726,
-0.67914568, -1.15593471],
...,
[ 0. , 1. , 0. , ..., -0.61549158,
-0.67914568, -0.31487349],
[ 0. , 0. , 1. , ..., 0.48889978,
-0.67914568, 1.08689522],
[ 0. , 1. , 0. , ..., -0.33939374,
-0.36871529, -0.59522723]])
from sklearn.model_selection import train_test_split
import sklearn.model_selection as model_selection

# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
# stratify=Y keeps the share of attrition cases the same in both parts; the
# target is imbalanced (mean of Attrition is about 0.16), so an unstratified
# split can leave the test set with too few positive rows to evaluate on.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_new,
    Y,
    test_size=0.20,
    random_state=200,
    stratify=Y,
)
X_train.shape
(1176, 53)
X_test.shape
(294, 53)
Y_train.shape
(1176,)
Y_test.shape
(294,)
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Baseline classifier. C is the inverse regularisation strength, so C=1e5
# leaves the L2 penalty very weak; max_iter is raised to 500.
logreg = LogisticRegression(penalty='l2', C=1e5, max_iter=500)

# fit() returns the estimator itself, so fitting and predicting can be chained.
Y_pred = logreg.fit(X_train, Y_train).predict(X_test)

logreg_accuracy = metrics.accuracy_score(Y_test, Y_pred)
print("Accuracy:", logreg_accuracy)
Accuracy: 0.8741496598639455
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the random forest.
# 'auto' is intentionally absent from max_features: for classifiers it was an
# alias of 'sqrt' (so it only repeated the same fits) and scikit-learn >= 1.3
# rejects it outright.
param_grid = {
    'n_estimators': [100, 250, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [5, 6, 7, 8, 9, 10],
    'criterion': ['gini', 'entropy'],
}

# Seeded so that repeated runs select the same best parameters.
rf = RandomForestClassifier(random_state=200)

# Exhaustive search with 9-fold cross-validation on all available cores.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=9,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, Y_train)
grid_search.best_params_
Fitting 9 folds for each of 108 candidates, totalling 972 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers. [Parallel(n_jobs=-1)]: Done 9 tasks | elapsed: 3.0s [Parallel(n_jobs=-1)]: Done 130 tasks | elapsed: 10.8s [Parallel(n_jobs=-1)]: Done 333 tasks | elapsed: 24.8s [Parallel(n_jobs=-1)]: Done 616 tasks | elapsed: 45.3s [Parallel(n_jobs=-1)]: Done 972 out of 972 | elapsed: 1.2min finished
{'criterion': 'gini',
'max_depth': 10,
'max_features': 'auto',
'n_estimators': 250}
from sklearn import metrics

# Best estimator found by the grid search.
model = grid_search.best_estimator_

# Score it on the held-out rows; this rebinds Y_pred.
Y_pred = model.predict(X_test)
rf_accuracy = metrics.accuracy_score(Y_test, Y_pred)
print("Accuracy:", rf_accuracy)
Accuracy: 0.8741496598639455
from sklearn.metrics import classification_report, confusion_matrix

# Per-class precision / recall / F1 for the current Y_pred.
cr = classification_report(y_true=Y_test, y_pred=Y_pred)
print(cr)
precision recall f1-score support
0 0.88 0.99 0.93 253
1 0.75 0.15 0.24 41
accuracy 0.87 294
macro avg 0.81 0.57 0.59 294
weighted avg 0.86 0.87 0.84 294
model
RandomForestClassifier(max_depth=10, n_estimators=250)
model.feature_importances_
array([0.00276199, 0.00872382, 0.00971095, 0.00318348, 0.00749233,
0.00745929, 0.00971985, 0.0023605 , 0.00796265, 0.0069103 ,
0.0076735 , 0.00227082, 0.00349829, 0.01162021, 0.00146143,
0.00240517, 0.00063344, 0.0073828 , 0.00670315, 0.00952692,
0.00736391, 0.0085661 , 0.01809692, 0. , 0.04029849,
0.03971613, 0.00635151, 0.00818799, 0.00772115, 0.01294218,
0.03325798, 0.03556215, 0.04849996, 0.02047774, 0.03176444,
0.03115271, 0.0221354 , 0.02298059, 0.02997191, 0.04558261,
0.03918754, 0.03199632, 0.0325047 , 0.00610841, 0.02206076,
0.03054476, 0.06272628, 0.02560096, 0.02441481, 0.04889998,
0.03100731, 0.02487155, 0.02998584])
feat_names
['onehotencoder__x0_Human Resources', 'onehotencoder__x0_Research & Development', 'onehotencoder__x0_Sales', 'onehotencoder__x1_Human Resources', 'onehotencoder__x1_Life Sciences', 'onehotencoder__x1_Marketing', 'onehotencoder__x1_Medical', 'onehotencoder__x1_Other', 'onehotencoder__x1_Technical Degree', 'onehotencoder__x2_Female', 'onehotencoder__x2_Male', 'onehotencoder__x3_Healthcare Representative', 'onehotencoder__x3_Human Resources', 'onehotencoder__x3_Laboratory Technician', 'onehotencoder__x3_Manager', 'onehotencoder__x3_Manufacturing Director', 'onehotencoder__x3_Research Director', 'onehotencoder__x3_Research Scientist', 'onehotencoder__x3_Sales Executive', 'onehotencoder__x3_Sales Representative', 'onehotencoder__x4_Divorced', 'onehotencoder__x4_Married', 'onehotencoder__x4_Single', 'onehotencoder__x5_Y', 'onehotencoder__x6_No', 'onehotencoder__x6_Yes', 'onehotencoder__x7_China', 'onehotencoder__x7_India', 'onehotencoder__x7_USA', 'ordinalencoder__BusinessTravel', 'standardscaler__Age', 'standardscaler__DailyRate', 'standardscaler__DistanceFromHome', 'standardscaler__Education', 'standardscaler__EnvironmentSatisfaction', 'standardscaler__HourlyRate', 'standardscaler__JobInvolvement', 'standardscaler__JobLevel', 'standardscaler__JobSatisfaction', 'standardscaler__MonthlyIncome', 'standardscaler__MonthlyRate', 'standardscaler__NumCompaniesWorked', 'standardscaler__PercentSalaryHike', 'standardscaler__PerformanceRating', 'standardscaler__RelationshipSatisfaction', 'standardscaler__StockOptionLevel', 'standardscaler__TotalWorkingYears', 'standardscaler__TrainingTimesLastYear', 'standardscaler__WorkLifeBalance', 'standardscaler__Company Tenure (yrs)', 'standardscaler__YearsInCurrentRole', 'standardscaler__YearsSinceLastPromotion', 'standardscaler__YearsWithCurrManager']
# Map each output feature name to its importance in the fitted forest.
# zip() stops at the shorter input, so the two sequences must line up.
dictionary = dict(zip(feat_names, model.feature_importances_))

# Rank by importance, highest first, and keep the names of the top three.
ranked_features = sorted(
    dictionary.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
my_keys = [feature for feature, _importance in ranked_features[:3]]
my_keys
['standardscaler__TotalWorkingYears', 'standardscaler__Company Tenure (yrs)', 'standardscaler__DistanceFromHome']
data_new.columns
Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
'DistanceFromHome', 'Education', 'EducationField',
'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement',
'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus',
'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'Over18',
'OverTime', 'PercentSalaryHike', 'PerformanceRating',
'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears',
'TrainingTimesLastYear', 'WorkLifeBalance', 'Company Tenure (yrs)',
'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager',
'Location'],
dtype='object')
# Column groups, one figure of count plots per group.
features1 = [
    'Age',
    'Attrition',
    'BusinessTravel',
    'DailyRate',
    'Department',
]
features2 = [
    'DistanceFromHome',
    'Educ_bucket',
    'EducationField',
]
# NOTE(review): 'WorkLifeBalance_bucket' also appears in features8, and
# 'YearsInCurrentRole' / 'YearsSinceLastPromotion' also appear in features9.
features3 = [
    'WorkLifeBalance_bucket',
    'Company Tenure (yrs)',
    'YearsInCurrentRole',
    'YearsSinceLastPromotion',
]
features4 = [
    'JobLevel',
    'JobRole',
    'JobSatisfaction_bucket',
    'MaritalStatus',
]
features5 = [
    'MonthlyIncome',
    'MonthlyRate',
    'NumCompaniesWorked',
    'Over18',
]
features6 = [
    'OverTime',
    'PercentSalaryHike',
    'PerformanceRating',
]
features7 = [
    'RelationshipSatisfaction_bucket',
    'StockOptionLevel',
    'TotalWorkingYears',
]
features8 = [
    'TrainingTimesLastYear',
    'WorkLifeBalance_bucket',
    'Company Tenure (yrs)_Bucket',
]
features9 = [
    'YearsInCurrentRole',
    'YearsSinceLastPromotion',
    'YearsWithCurrManager',
    'Location',
]
features10 = [
    'EnvironmentSatisfaction_Bucket',
    'JobInvolvement_bucket',
]
data_new
| Age | Attrition | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EnvironmentSatisfaction | Gender | ... | RelationshipSatisfaction | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | Company Tenure (yrs) | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | Location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 40 | 1 | Travel_Rarely | 1100 | Sales | 1 | 2 | Life Sciences | 2 | Female | ... | 1 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 | India |
| 1 | 50 | 0 | Travel_Frequently | 300 | Research & Development | 8 | 1 | Life Sciences | 3 | Male | ... | 4 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 | China |
| 2 | 40 | 1 | Travel_Rarely | 1400 | Research & Development | 2 | 2 | Other | 4 | Male | ... | 2 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 | India |
| 3 | 30 | 0 | Travel_Frequently | 1400 | Research & Development | 3 | 4 | Life Sciences | 4 | Female | ... | 3 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 | India |
| 4 | 30 | 0 | Travel_Rarely | 600 | Research & Development | 2 | 1 | Medical | 1 | Male | ... | 4 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 | India |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1465 | 40 | 0 | Travel_Frequently | 900 | Research & Development | 23 | 2 | Medical | 3 | Male | ... | 3 | 1 | 17 | 3 | 3 | 5 | 2 | 0 | 3 | USA |
| 1466 | 40 | 0 | Travel_Rarely | 600 | Research & Development | 6 | 1 | Medical | 4 | Male | ... | 1 | 1 | 9 | 5 | 3 | 7 | 7 | 1 | 7 | USA |
| 1467 | 30 | 0 | Travel_Rarely | 200 | Research & Development | 4 | 3 | Life Sciences | 2 | Male | ... | 2 | 1 | 6 | 0 | 3 | 6 | 2 | 0 | 3 | USA |
| 1468 | 50 | 0 | Travel_Frequently | 1000 | Sales | 2 | 3 | Medical | 4 | Male | ... | 4 | 0 | 17 | 3 | 2 | 9 | 6 | 0 | 8 | USA |
| 1469 | 30 | 0 | Travel_Rarely | 600 | Research & Development | 8 | 3 | Medical | 2 | Male | ... | 1 | 0 | 6 | 3 | 4 | 4 | 3 | 1 | 2 | USA |
1470 rows × 33 columns
def _plot_attrition_counts(feature_group):
    """Draw one figure of count plots, split by Attrition, for a feature group.

    Each feature gets its own cell in a 3x2 grid, so a group may hold at most
    six column names. The counts are read from ``data_original``.

    Parameters
    ----------
    feature_group : list of str
        Column names of ``data_original`` to plot.

    Returns
    -------
    matplotlib.figure.Figure
        The figure that was created.
    """
    # plt.figure(), not plt.subplots(): the latter also creates a full-size
    # Axes that would sit underneath the 3x2 grid added below.
    figure = plt.figure(figsize=(50, 50))
    for position, feature in enumerate(feature_group, start=1):
        plt.subplot(3, 2, position)
        sns.countplot(x=feature, data=data_original, hue='Attrition')
        plt.xticks(rotation=90, fontsize=30)
        plt.yticks(rotation=0, fontsize=30)
        plt.xlabel(feature, fontsize=40)
        plt.ylabel('count', fontsize=40)
        plt.title("No. of employee", fontsize=40)
    # Extra vertical room for the rotated tick labels; applied once per figure.
    plt.subplots_adjust(hspace=1.0)
    return figure


# One figure per feature group, in the original order.
for feature_group in (
    features1, features2, features3, features4, features5,
    features6, features7, features8, features9, features10,
):
    fig = _plot_attrition_counts(feature_group)